1 ST data analysis pipeline (ST-BRCA project)

1.1 Loading packages

library(ggplot2)
library(Seurat)
library(STutility)
library(Matrix)
library(Rcpp)
library(harmony)
library(patchwork)
library(sctransform)
library(png)
library(dplyr)
library(gprofiler2)
library(scCATCH)
library(tidyr)
library(VennDiagram)
# library(cellassign) library(SingleCellExperiment)
# library(tensorflow) library(scater)

1.1.1 Setting global variables

This analysis pipeline works under the assumption that the sample files (Control and Tumor) are stored in each individual folder, one for the control sample, one for the tumor. It will create a list containing all the paths needed for downstream analysis using Stutility and Seurat. A global path needs to be provided (which contains the two sample folders), as well as the id’s for the control and tumor samples.

global.path <- "/mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33"  # Replace with data path
setwd(global.path)  # this sets the file path as the working directory

idControl <- "S33-C-2"  # Name of the control sample
idTumor <- "S33-T-2"  # Name of the tumor sample

In this table we create a data.frame that will contain the to the paths we need as input for our analysis (samples, spot files, images, .json), so we can later make it iterable and run our analysis inside the R markdown file.

samplePaths <- list.files(global.path, full.names = T)
infoTable <- data.frame()

for (sample in 1:length(samplePaths)) {
    newDf <- data.frame(samples = list.files(samplePaths[sample],
        full.names = T, pattern = ".h5"), spotfiles = list.files(samplePaths[sample],
        full.names = T, pattern = ".csv"), imgs = list.files(samplePaths[sample],
        full.names = T, pattern = ".png"), json = list.files(samplePaths[sample],
        full.names = T, pattern = ".json"))
    infoTable <- rbind(infoTable, newDf)
}

1.2 Creating a Seurat Object from the table

se <- InputFromTable(infotable = infoTable, platform = "Visium")
## Using spotfiles to remove spots outside of tissue
## Loading /mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33/S33-C-2/filtered_feature_bc_matrix.h5 count matrix from a 'Visium' experiment
## Loading /mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33/S33-T-2/filtered_feature_bc_matrix.h5 count matrix from a 'Visium' experiment
## 
## ------------- Filtering (not including images based filtering) -------------- 
##   Spots removed:  0  
##   Genes removed:  16692  
## Saving capture area ranges to Staffli object 
## After filtering the dimensions of the experiment is: [19909 genes, 1850 spots]
se$sample_id <- paste0("sample_", GetStaffli(se)@meta.data$sample)
se$orig.ident <- "S33"
st.object <- GetStaffli(se)
st.object
## An object of class Staffli 
## 1850 spots across 2 samples.

1.3 Initial visualization

For an initial view of the distribution of the features along the spots, we use the “ST.FeaturePlot” command.

1.3.1 Feature plot

ST.FeaturePlot(se, features = c("nFeature_RNA"), palette = "Spectral",
    ncol = 2, pt.size = 2.5)

1.3.2 Count plot

ST.FeaturePlot(se, features = c("nCount_RNA"), palette = "Spectral",
    ncol = 2, pt.size = 2.5)

1.4 QC

1.4.1 QC plots

1.4.2 Generate a subset for the control and tumor

# se_subsetControl <- InputFromTable(infoTable[1,])
se_subsetControl <- SubsetSTData(se, expression = sample_id %in%
    "sample_1")
se_subsetControl <- SetIdent(se_subsetControl, value = idControl)
se_subsetControl$sample_id <- idControl
se_subsetControl
## An object of class Seurat 
## 19909 features across 1290 samples within 1 assay 
## Active assay: RNA (19909 features, 0 variable features)
# se_subsetTumor <- InputFromTable(infoTable[2,])
se_subsetTumor <- SubsetSTData(se, expression = sample_id %in%
    "sample_2")
se_subsetTumor <- SetIdent(se_subsetTumor, value = idTumor)
se_subsetTumor$sample_id <- idTumor
se_subsetTumor
## An object of class Seurat 
## 19909 features across 560 samples within 1 assay 
## Active assay: RNA (19909 features, 0 variable features)

1.4.3 Generate QC plotting function

This is unused by now. this generates feature and count distribution in the form of histograms.

generateQCplots <- function(sampleSubset, title, color) {

    p1 <- ggplot() + geom_histogram(data = sampleSubset[[]],
        aes(nFeature_RNA), fill = color, alpha = 0.7, bins = 100) +
        ggtitle("Unique genes per spot")

    p2 <- ggplot() + geom_histogram(data = sampleSubset[[]],
        aes(nCount_RNA), fill = color, alpha = 0.7, bins = 100) +
        ggtitle("Total counts per spots")

    gene_attr <- data.frame(nUMI = Matrix::rowSums(sampleSubset@assays$RNA@counts),
        nSpots = Matrix::rowSums(sampleSubset@assays$RNA@counts >
            0))
    p3 <- ggplot() + geom_histogram(data = gene_attr, aes(nUMI),
        fill = color, alpha = 0.7, bins = 100) + scale_x_log10() +
        ggtitle("Total counts per gene (log10 scale)")

    p4 <- ggplot() + geom_histogram(data = gene_attr, aes(nSpots),
        fill = color, alpha = 0.7, bins = 100) + ggtitle("Total spots per gene")

    (p1 - p2)/(p3 - p4) + plot_annotation(title = title)

}

1.4.4 QC plots

1.4.4.1 Violin QC plots

# mitochondrial genes
mt.genesControl <- grep(pattern = "^MT-", x = rownames(se_subsetControl),
    value = TRUE)
se_subsetControl$percent.mito <- (Matrix::colSums(se_subsetControl@assays$RNA@counts[mt.genesControl,
    ])/Matrix::colSums(se_subsetControl@assays$RNA@counts)) *
    100
# VlnPlot(se_subsetControl, features = c('nCount_RNA',
# 'nFeature_RNA', 'percent.mito'), pt.size = 0.1, ncol = 3)
# + plot_annotation(paste('QC violin plot Control: ',
# idControl))

mt.genesTumor <- grep(pattern = "^MT-", x = rownames(se_subsetTumor),
    value = TRUE)
se_subsetTumor$percent.mito <- (Matrix::colSums(se_subsetTumor@assays$RNA@counts[mt.genesTumor,
    ])/Matrix::colSums(se_subsetTumor@assays$RNA@counts)) * 100
# VlnPlot(se_subsetTumor, features = c('nCount_RNA',
# 'nFeature_RNA', 'percent.mito'), pt.size = 0.1, ncol = 3,
# cols = 'cornflowerblue') + plot_annotation(paste('QC
# violin plot Tumor: ', idTumor))

# ribosomal genes
rb.genesControl <- grep(pattern = "^RP[SL]", x = rownames(se_subsetControl),
    value = TRUE)
se_subsetControl$percent.ribo <- (Matrix::colSums(se_subsetControl@assays$RNA@counts[rb.genesControl,
    ])/Matrix::colSums(se_subsetControl@assays$RNA@counts)) *
    100
VlnPlot(se_subsetControl, features = c("nCount_RNA", "nFeature_RNA",
    "percent.mito", "percent.ribo"), pt.size = 0.1, ncol = 4) +
    plot_annotation(paste("QC violin plot Control: ", idControl))

rb.genesTumor <- grep(pattern = "^RP[SL]", x = rownames(se_subsetTumor),
    value = TRUE)
se_subsetTumor$percent.ribo <- (Matrix::colSums(se_subsetTumor@assays$RNA@counts[rb.genesTumor,
    ])/Matrix::colSums(se_subsetTumor@assays$RNA@counts)) * 100
VlnPlot(se_subsetTumor, features = c("nCount_RNA", "nFeature_RNA",
    "percent.mito", "percent.ribo"), pt.size = 0.1, ncol = 4,
    cols = "cornflowerblue") + plot_annotation(paste("QC violin plot Tumor: ",
    idTumor))

1.4.4.2 Merged violin QC plots

merged <- MergeSTData(se_subsetControl, se_subsetTumor)
VlnPlot(merged, features = c("nCount_RNA", "nFeature_RNA", "percent.mito",
    "percent.ribo"), pt.size = 0.1, ncol = 4, group.by = "sample_id",
    cols = c("brown3", "cornflowerblue")) + plot_annotation("QC violin plot")

1.4.4.3 Count vs Feature correlation

These plots will help us with the thresholding:

FeatureScatter(se_subsetControl, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
    cols = "brown3") + plot_annotation(paste("Count vs Gene Scatter Plot ",
    idControl))

FeatureScatter(se_subsetTumor, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
    cols = "cornflowerblue") + plot_annotation(paste("Count vs Gene Scatter Plot ",
    idTumor))

#### Count vs Mitochondrial content

p1 <- FeatureScatter(se_subsetControl, feature1 = "nCount_RNA",
    feature2 = "percent.mito", cols = "brown3") + plot_annotation(paste("Count vs % Mitochondrial Scatter Plot ",
    idControl))
p2 <- FeatureScatter(se_subsetTumor, feature1 = "nCount_RNA",
    feature2 = "percent.mito", cols = "cornflowerblue") + plot_annotation(paste("Count vs % Mitochondrial Scatter Plot ",
    idTumor))

p1 - p2

1.4.4.4 Count vs Ribosomal content

FeatureScatter(se_subsetControl, feature1 = "nCount_RNA", feature2 = "percent.ribo",
    cols = "brown3") + plot_annotation(paste("Count vs % Ribosomal Scatter Plot ",
    idControl)) - FeatureScatter(se_subsetTumor, feature1 = "nCount_RNA",
    feature2 = "percent.ribo", cols = "cornflowerblue") + plot_annotation(paste("Count vs % Ribosomal Scatter Plot ",
    idTumor))

1.4.5 QC filtering

This will generate a se subset, a Seurat class object subset that will take the infoTable as input for the corresponding sample and apply filtering based on the following parameters:
- Keeping only the genes that have presence in at least 5 capture spots and a total count value of >= 100 - Keeping the only that spots that contain >= 200 transcripts

1.4.5.1 Filtering spots and genes

Here we are filtering based on a minimum UMI count per spot, minimum gene per spot, and also filtering genes that are not seen in more than N spots.

# Control
se_subsetControlQC <- SubsetSTData(se_subsetControl, expression = nCount_RNA >
    200)
se_subsetControlQC <- SubsetSTData(se_subsetControlQC, expression = nFeature_RNA >
    100)
cat("Spots removed: ", ncol(se_subsetControl) - ncol(se_subsetControlQC),
    "\n")
## Spots removed:  527
# Tumor
se_subsetTumorQC <- SubsetSTData(se_subsetTumor, expression = nCount_RNA >
    200)
se_subsetTumorQC <- SubsetSTData(se_subsetTumorQC, expression = nFeature_RNA >
    100)
cat("Spots removed: ", ncol(se_subsetTumor) - ncol(se_subsetTumorQC),
    "\n")
## Spots removed:  4

1.4.5.2 Mitochondrial and Ribosomal filtering

# Control
se_subsetControlQCmito <- SubsetSTData(se_subsetControlQC, expression = percent.mito <
    15)
se_subsetControlQCribomito <- SubsetSTData(se_subsetControlQCmito,
    expression = percent.ribo < 25)
cat("Spots removed: ", ncol(se_subsetControlQC) - ncol(se_subsetControlQCribomito),
    "\n")
## Spots removed:  48
se_subsetControlQC <- se_subsetControlQCribomito
# Tumor
se_subsetTumorQCmito <- SubsetSTData(se_subsetTumorQC, expression = percent.mito <
    15)
se_subsetTumorQCribomito <- SubsetSTData(se_subsetTumorQCmito,
    expression = percent.ribo < 25)
cat("Spots removed: ", ncol(se_subsetTumorQC) - ncol(se_subsetTumorQCribomito),
    "\n")
## Spots removed:  20
se_subsetTumorQC <- se_subsetTumorQCribomito

1.4.5.3 Filter genes

# Genes distributed in less than N spots
expMatControl <- GetAssayData(se_subsetControlQC, slot = "counts")
expMatTumor <- GetAssayData(se_subsetTumorQC, slot = "counts")
gene.countsControl <- Matrix::rowSums(expMatControl)
gene.countsTumor <- Matrix::rowSums(expMatTumor)
valid.genesControl <- gene.countsControl > 2
valid.genesTumor <- gene.countsTumor > 2
keep.genesControl <- rownames(se_subsetControlQC)[valid.genesControl]
keep.genesTumor <- rownames(se_subsetTumorQC)[valid.genesTumor]
se_subsetControlQC <- SubsetSTData(se_subsetControlQC, features = keep.genesControl)
se_subsetTumorQC <- SubsetSTData(se_subsetTumorQC, features = keep.genesTumor)
length(keep.genesControl)
## [1] 12635
length(keep.genesTumor)
## [1] 15029
# Filter MALAT1

keep.genesControl <- keep.genesControl[!(grepl("MALAT1", keep.genesControl))]
keep.genesTumor <- keep.genesTumor[!(grepl("MALAT1", keep.genesTumor))]
se_subsetControlQC <- SubsetSTData(se_subsetControlQC, features = keep.genesControl)
se_subsetTumorQC <- SubsetSTData(se_subsetTumorQC, features = keep.genesTumor)
length(keep.genesControl)
## [1] 12634
length(keep.genesTumor)
## [1] 15028
# Filter RPL13

keep.genesControl <- keep.genesControl[!(grepl("RPL13", keep.genesControl))]
keep.genesTumor <- keep.genesTumor[!(grepl("RPL13", keep.genesTumor))]
se_subsetControlQC <- SubsetSTData(se_subsetControlQC, features = keep.genesControl)
se_subsetTumorQC <- SubsetSTData(se_subsetTumorQC, features = keep.genesTumor)
length(keep.genesControl)
## [1] 12631
length(keep.genesTumor)
## [1] 15025
# Update with valid genes
se_subsetControlQC <- SubsetSTData(se_subsetControlQC, features = keep.genesControl)
se_subsetTumorQC <- SubsetSTData(se_subsetTumorQC, features = keep.genesTumor)

1.4.6 QC plots after filtering

1.4.6.1 Violin QC plots (filtered)

# Control
VlnPlot(se_subsetControlQC, features = c("nCount_RNA", "nFeature_RNA",
    "percent.mito", "percent.ribo"), pt.size = 0.1, ncol = 4) +
    plot_annotation(paste("QC violin plot Control:", idControl,
        "(filtered)"))

# Tumor
VlnPlot(se_subsetTumorQC, features = c("nCount_RNA", "nFeature_RNA",
    "percent.mito", "percent.ribo"), pt.size = 0.1, ncol = 4,
    cols = "cornflowerblue") + plot_annotation(paste("QC violin plot Tumor:",
    idTumor, "(filtered)"))

1.4.6.2 Merged violin QC plots (filtered)

mergedQC <- MergeSTData(se_subsetControlQC, se_subsetTumorQC)
VlnPlot(mergedQC, features = c("nCount_RNA", "nFeature_RNA",
    "percent.mito", "percent.ribo"), pt.size = 0.1, ncol = 4,
    group.by = "sample_id", cols = c("brown3", "cornflowerblue")) +
    plot_annotation("QC violin plot (filtered)")

1.4.7 Count plots after filtering

ST.FeaturePlot(se_subsetControlQC, features = c("nCount_RNA"),
    palette = "Spectral", ncol = 1, pt.size = 2.5) + plot_annotation(paste("Count spatial plot: ",
    idControl, "(filtered)"))

ST.FeaturePlot(se_subsetTumorQC, features = c("nCount_RNA"),
    palette = "Spectral", ncol = 1, pt.size = 2.5) + plot_annotation(paste("Count spatial plot: ",
    idTumor, "(filtered)"))

1.4.8 Object name change

For simplicity, we are raplacing the name of our QC’d seurat object subsets to something more simple:

seControl <- se_subsetControlQC
seTumor <- se_subsetTumorQC

1.5 Overlaying feature plot with H&E image

# Control
widthControl <- dim(readPNG(infoTable$imgs[1]))[2]
seControl <- LoadImages(seControl, time.resolve = TRUE, verbose = TRUE,
    xdim = widthControl)
## Loading images for 1 samples: 
##   Reading /mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33/S33-C-2/tissue_hires_image.png for sample 1 ... 
##   Scaling down sample 1 image from 1957x2000 pixels to 1957x2000 pixels
controlPlot <- FeatureOverlay(seControl, features = "nCount_RNA",
    pt.size = 2.5, palette = "Spectral", type = "raw", pt.alpha = 0.6)

se_subsetControl <- LoadImages(se_subsetControl, time.resolve = TRUE,
    verbose = TRUE, xdim = widthControl)
## Loading images for 1 samples: 
##   Reading /mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33/S33-C-2/tissue_hires_image.png for sample 1 ... 
##   Scaling down sample 1 image from 1957x2000 pixels to 1957x2000 pixels
controlPlotUnfiltered <- FeatureOverlay(se_subsetControl, features = "nCount_RNA",
    pt.size = 2.5, palette = "Spectral", type = "raw", pt.alpha = 0.6)

# Tumor
widthTumor <- dim(readPNG(infoTable[2, ]$imgs))[1]
seTumor <- LoadImages(seTumor, time.resolve = TRUE, verbose = TRUE,
    xdim = widthTumor)
## Loading images for 1 samples: 
##   Reading /mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33/S33-T-2/tissue_hires_image.png for sample 1 ... 
##   Scaling down sample 1 image from 2000x1916 pixels to 1916x1836 pixels
tumorPlot <- FeatureOverlay(seTumor, features = "nCount_RNA",
    pt.size = 2.5, palette = "Spectral", type = "raw", pt.alpha = 0.6)

se_subsetTumor <- LoadImages(se_subsetTumor, time.resolve = TRUE,
    verbose = TRUE, xdim = widthTumor)
## Loading images for 1 samples: 
##   Reading /mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33/S33-T-2/tissue_hires_image.png for sample 1 ... 
##   Scaling down sample 1 image from 2000x1916 pixels to 1916x1836 pixels
tumorPlotUnfiltered <- FeatureOverlay(se_subsetTumor, features = "nCount_RNA",
    pt.size = 2.5, palette = "Spectral", pt.alpha = 0.6, type = "raw")

(controlPlotUnfiltered + ggtitle("Overlay plot S33-C-2 (unfiltered)"))

(controlPlot + ggtitle("Overlay plot S33-C-2 (filtered)"))

(tumorPlotUnfiltered + ggtitle("Overlay plot S33-T-2 (unfiltered)"))

(tumorPlot + ggtitle("Overlay plot S33-T-2 (filtered)"))

1.6 Normalization

Data normalization is made using the SCTransform function included in Seurat. The one being used is Variance Stabilized Transformation (Hafemeister & Satija, 2019).

1.6.1 SCTransform

# Control
seControl <- SCTransform(seControl, return.only.var.genes = FALSE)
seControl
sprintf("the default assay in use is: %s", DefaultAssay(seControl))

# Tumor
seTumor <- SCTransform(seTumor, return.only.var.genes = FALSE)
seTumor
sprintf("the default assay in use is: %s", DefaultAssay(seTumor))

1.7 Top expressed genes

1.7.0.1 Function to get the top 20 genes

We can later use this function in the clusters to extract the top genes in each cluster

get_top20Genes <- function(object, title) {
    C = object@assays$SCT@counts  # WE ARE USING THE NORMALIZED COUNTS
    C@x = C@x/rep.int(colSums(C), diff(C@p))
    most_expressed <- order(Matrix::rowSums(C), decreasing = T)[20:1]
    boxplot(as.matrix(t(C[most_expressed, ])), cex = 0.1, las = 1,
        xlab = "% total count per cell", col = (scales::hue_pal())(20)[20:1],
        horizontal = TRUE)
    title(title)
}
pC <- get_top20Genes(seControl, paste("Top 20 expressed genes: ",
    idControl))

pC
## NULL
pT <- get_top20Genes(seTumor, paste("Top 20 expressed genes: ",
    idTumor))

pT
## NULL

1.8 Control and Tumor sample Integration with Harmony

1.8.1 Dimensionality reduction by PCA and Harmony batch correction

# Merge both se objects
mergedPCA <- MergeSTData(seControl, seTumor)
mergedPCA <- SCTransform(mergedPCA, return.only.var.genes = FALSE)
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 14256 by 1251
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 1251 cells
## Found 61 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 14256 genes
## Computing corrected count matrix for 14256 genes
## Calculating gene attributes
## Wall clock passed: Time difference of 13.83704 secs
## Determine variable features
## Place corrected count matrix in counts slot
## Centering data matrix
## Set default assay to SCT
mergedPCA <- RunPCA(mergedPCA, assay = "SCT")
## PC_ 1 
## Positive:  PIP, SCGB2A2, DCN, S100A6, KRT15, ITM2B, KRT17, KRT14, TAGLN, RPL34 
##     TXNIP, MYL9, CST3, GSN, STC2, CFD, ACTA2, FTH1, IGKC, RPL17 
##     A2M, MT1X, VIM, RPS3A, IGHA1, MIR205HG, SPARCL1, PTN, SAA1, RPS27A 
## Negative:  CD24, CHCHD2, PKM, CBX3, PPIA, CYCS, COL1A1, MT-ATP6, PEG10, PSAP 
##     FN1, HSP90AA1, RPS24, MT-ND2, AZGP1, COL1A2, MT-CO3, VOPP1, MT-ND4, MT-ND3 
##     SCD, COL3A1, RPS29, POSTN, SPARC, CRIP2, CTSD, MT-ND1, APOD, COMP 
## PC_ 2 
## Positive:  PIP, AZGP1, MGP, KRT15, SCGB2A2, STC2, RPL17, RPS4X, PTN, TFF3 
##     TACSTD2, MIR205HG, AGR2, SERPINA3, SERPINA1, MUCL1, PI15, ITM2B, KRT14, KRT17 
##     RPL34, EVL, ELOVL5, MT1X, SEC14L2, CHCHD2, WFDC2, RPL5, SCGB3A1, RPS27A 
## Negative:  CCDC80, COL1A2, MT-ND3, COL1A1, CFD, MT-ND4, C3, BGN, FSTL1, TNXB 
##     CD74, IGFBP7, RNASE1, DCN, COL6A1, TMSB4X, COL6A3, COL6A2, B2M, HLA-E 
##     FTL, NEAT1, C1QA, VIM, SPARC, GPX3, MMP2, CALD1, HLA-B, SAMHD1 
## PC_ 3 
## Positive:  CHCHD2, DCN, RPS24, PEG10, IGHA1, IGKC, GSN, IGFBP6, CFD, VOPP1 
##     CXCL12, C3, FBLN1, TNXB, CD24, JCHAIN, PSAP, CLDN5, PLAC9, GPX3 
##     C1R, S100A6, C1S, EGFL7, SERPING1, VIM, ADH1B, CBX3, DEPP1, IFITM2 
## Negative:  TCN1, MUC1, CALML5, CTSD, SERPINA3, LTF, S100A9, IGF1R, CRIP1, CLEC3A 
##     S100A7, GDF9, LRG1, SLC30A8, APOD, LRRC6, SHROOM1, RPS29, AGR3, SKIL 
##     COX6C, LAPTM4B, CRIP2, PYGL, MT-ND4, MTRNR2L12, ESR1, SERPINA1, CD59, EVL 
## PC_ 4 
## Positive:  DCN, APOD, CRIP1, S100A9, CFD, IGF1R, IGHA1, LTF, CLEC3A, TCN1 
##     FBLN1, IGFBP6, FTL, GDF9, CRIP2, CALML5, C3, MUC1, RPS29, LAPTM4B 
##     SLC30A8, S100A7, S100A6, LRG1, AEBP1, LRRC6, C1S, VIM, C1R, SHROOM1 
## Negative:  CCDC80, COL1A2, COL1A1, NEAT1, SCGB2A2, BGN, CALD1, FSTL1, PIP, MT-ND3 
##     SAMHD1, PDK4, NAP1L1, ACTA2, CD24, KRT15, STC2, MAP1B, POSTN, PTMS 
##     COL6A3, EGR1, ANKRD12, PRRC2C, RBM25, KLF6, THY1, AL627171.2, HSP90AA1, NFIX 
## PC_ 5 
## Positive:  FABP4, SAA1, FHL1, MT-ND4, CFD, PLIN4, PLIN1, CD36, MT-ND3, AL627171.2 
##     CCDC80, ADIRF, CAV1, ADH1B, GPX3, AOC3, LPL, CAVIN2, LIPE, GPD1 
##     TNS1, MGLL, PNPLA2, MT-ND1, MT-CO3, RBP4, NEAT1, SPTBN1, AQP1, PDK4 
## Negative:  COL1A1, FN1, COL1A2, COL3A1, POSTN, HLA-DRA, TAGLN, APOE, KRT17, KRT14 
##     SFRP2, HLA-DRB1, LUM, CCN2, ACTA2, AEBP1, COL6A1, CD74, COL6A2, HLA-DPA1 
##     MYL9, BGN, TPM2, SPARC, LYZ, KRT5, ACTG2, CNN1, THBS1, COL12A1
head(Loadings(mergedPCA, reduction = "pca")[, 1:5])

1.8.1.1 PCA plot of uncorrected Principal Components

plotPCA <- DimPlot(mergedPCA, reduction = "pca", group.by = "sample_id",
    cols = c("brown3", "cornflowerblue")) + ggtitle("PCA plot (Control vs Tumor)")
plotVln <- VlnPlot(mergedPCA, features = "PC_1", group.by = "sample_id",
    cols = c("brown3", "cornflowerblue"))

plotPCA - plotVln

1.8.1.2 Elbow plots

ElbowPlot(mergedPCA) + ggtitle("Elbow Plot")

1.8.1.3 Run Harmony

mergedPCA <- RunHarmony(mergedPCA, group.by.vars = "sample_id",
    assay.use = "SCT", reduction = "pca", plot_convergence = TRUE)
## Harmony 1/10
## Harmony 2/10
## Harmony converged after 2 iterations
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.SCT.harmony; see ?make.names for more details
## on syntax validity

1.8.1.4 PCA plots after correction

plotPCA <- DimPlot(mergedPCA, reduction = "harmony", group.by = "sample_id",
    cols = c("brown3", "cornflowerblue")) + ggtitle("PCA plot (Control vs Tumor)")
plotVln <- VlnPlot(mergedPCA, features = "harmony_1", group.by = "sample_id",
    cols = c("brown3", "cornflowerblue"))

plotPCA - plotVln

1.9 Expression of marker genes

1.9.1 PAM50 genes

pam50 <- c("UBE2T", "BIRC5", "NUF2", "CDC6", "CCNB1", "TYMS",
    "MYBL2", "CEP55", "MELK", "NDC80", "RRM2", "UBE2C", "CENPF",
    "PTTG1", "EXO1", "ORC6L", "ANLN", "CCNE1", "CDC20", "MKI67",
    "KIF2C", "ACTR3B", "MYC", "EGFR", "KRT5", "PHGDH", "CDH3",
    "MIA", "KRT17", "FOXC1", "SFRP1", "KRT14", "ESR1", "SLC39A6",
    "BAG1", "MAPT", "PGR", "CXXC5", "MLPH", "BCL2", "MDM2", "NAT1",
    "FOXA1", "BLVRA", "MMP11", "GPR160", "FGFR4", "GRB7", "TMEM45B",
    "ERBB2")

for (gene in pam50) {

    plot <- tryCatch(ST.FeaturePlot(mergedPCA, features = gene,
        pt.size = 2.5, palette = "Spectral", ncol = 2), error = function(e) NA)
    print(plot)
}

## [1] NA

1.9.2 Markers from Yoosuf et al. 

(https://doi.org/10.1186/s13058-019-1242-9)

# NORAD (LINC00657); COL1A2; SCD
ST.FeaturePlot(mergedPCA, features = c("NORAD"), pt.size = 2.5,
    palette = "Spectral", ncol = 2)

ST.FeaturePlot(mergedPCA, features = c("COL1A2"), pt.size = 2.5,
    palette = "Spectral", ncol = 2)

ST.FeaturePlot(mergedPCA, features = c("SCD"), pt.size = 2.5,
    palette = "Spectral", ncol = 2)

1.9.3 Cancer sample vs non-cancer expression of markers

1.9.3.1 PAM50

VlnPlot(mergedPCA, features = pam50, same.y.lims = TRUE, group.by = "sample_id",
    ncol = 10)
## Warning in FetchData(object = object, vars = features, slot = slot): The
## following requested variables were not found: ORC6L

1.9.4 Markers from Yoosuf et al. 

VlnPlot(mergedPCA, features = c("NORAD", "COL1A2", "SCD"), same.y.lims = TRUE,
    group.by = "sample_id", ncol = 3)
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing missing values (geom_point).

1.10 Clustering the integrated dataset

mergedPCA <- FindNeighbors(mergedPCA, reduction = "harmony",
    dims = 1:8)  ## BASED ON ELBOW PLOT
## Computing nearest neighbor graph
## Computing SNN
mergedPCA <- FindClusters(mergedPCA, verbose = FALSE, resolution = 0.2)  ## CLUSTER FORMATION RESOLUTION
mergedPCA <- RunUMAP(mergedPCA, reduction = "harmony", dims = 1:8)  ## BASED ON ELBOW PLOT
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 17:32:08 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:32:08 Read 1251 rows and found 8 numeric columns
## 17:32:08 Using Annoy for neighbor search, n_neighbors = 30
## 17:32:08 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:32:08 Writing NN index file to temp file /tmp/Rtmpi7ML7W/file716e70f7d233
## 17:32:08 Searching Annoy index using 1 thread, search_k = 3000
## 17:32:09 Annoy recall = 100%
## 17:32:09 Commencing smooth kNN distance calibration using 1 thread
## 17:32:10 Initializing from normalized Laplacian + noise
## 17:32:10 Commencing optimization for 500 epochs, with 49376 positive edges
## 17:32:12 Optimization finished

1.10.0.1 UMAP merged plot by sample ID

plt.UMAP.merged <- DimPlot(mergedPCA, reduction = "umap", group.by = "sample_id") +
    ggtitle(paste("UMAP plot Merged(", idControl, "and", idTumor,
        ")"))
plt.UMAP.merged

1.10.0.2 UMAP plot by cluster

plt.UMAP.merged <- DimPlot(mergedPCA, reduction = "umap", split.by = "sample_id") +
    ggtitle(paste("UMAP plot Merged(", idControl, "and", idTumor,
        ")"))
plt.UMAP.merged

1.10.0.3 Spatial UMAP merged

plt.spatialUMAP.merged <- ST.FeaturePlot(mergedPCA, features = "seurat_clusters",
    pt.size = 2.6, palette = "Spectral", ncol = 2) + ggtitle(paste("Spatial UMAP plot Merged(",
    idControl, "and", idTumor, ")"))
plt.spatialUMAP.merged

1.10.0.4 Spatial UMAP by cluster

1.10.0.4.1 Subsetting corrected data
# Control
seControl <- SubsetSTData(mergedPCA, expression = sample_id %in%
    idControl)

# Tumor
seTumor <- SubsetSTData(mergedPCA, expression = sample_id %in%
    idTumor)

1.10.0.5 Split UMAP plot

# Control
ST.FeaturePlot(seControl, features = "seurat_clusters", pt.size = 2.6,
    split.labels = T, ncol = 3) + ggtitle(paste("Spatial UMAP plot ",
    idControl))

# Tumor
ST.FeaturePlot(seTumor, features = "seurat_clusters", pt.size = 2.6,
    split.labels = T, ncol = 3) + ggtitle(paste("Spatial UMAP plot ",
    idTumor))

1.10.0.6 Overlaying UMAP clusters with tissue

# Control
widthControl <- dim(readPNG(infoTable$imgs[1]))[2]
seControl <- LoadImages(seControl, time.resolve = TRUE, verbose = TRUE,
    xdim = widthControl)
## Loading images for 1 samples: 
##   Reading /mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33/S33-C-2/tissue_hires_image.png for sample 1 ... 
##   Scaling down sample 1 image from 1957x2000 pixels to 1957x2000 pixels
FeatureOverlay(seControl, features = "seurat_clusters", pt.size = 2.5,
    palette = "Spectral", pt.alpha = 0.4, sample.label = FALSE) +
    ggtitle(paste("Spatial overlay UMAP plot ", idControl))

# Tumor
widthTumor <- dim(readPNG(infoTable[2, ]$imgs))[1]
seTumor <- LoadImages(seTumor, time.resolve = TRUE, verbose = TRUE,
    xdim = widthTumor)
## Loading images for 1 samples: 
##   Reading /mnt/storage/Documents/Academia/Daub_Lab/ST_BRCA_Project/Analysis/Data/patients_selected/S33/S33-T-2/tissue_hires_image.png for sample 1 ... 
##   Scaling down sample 1 image from 2000x1916 pixels to 1916x1836 pixels
FeatureOverlay(seTumor, features = "seurat_clusters", pt.size = 2.5,
    palette = "Spectral", pt.alpha = 0.4, sample.label = FALSE) +
    ggtitle(paste("Spatial overlay UMAP plot ", idTumor))

1.11 Differential expression analysis (DEA)

1.11.1 Find All markers

1.11.1.1 (Pairwise between its cluster and its background (rest of the tissue))

Using FindMarkers function (Seurat). Default test = Wilcoxon Rank Sum test. Using a p-value threshold of 0.01

markersAll <- FindAllMarkers(mergedPCA, assay = "SCT", return.thresh = 0.01)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
markersAll %>%
    group_by(cluster) %>%
    top_n(n = 4, wt = avg_log2FC)

1.11.2 Heatmap

Here we are going to plot a heatmap of the 20 top expressed genes per cluster

top20 <- markersAll %>%
    group_by(cluster) %>%
    top_n(n = 20, wt = avg_log2FC)
DoHeatmap(mergedPCA, features = top20$gene, group.bar.height = 0.005,
    slot = "scale.data") + plot_annotation("Heatmap of differentially expressed genes between clusters")

1.12 Cluster visualizations

1.12.0.1 Function to generate plots for most DE genes per cluster

generateplotsDE <- function(cluster, ngenes, plot) {
    cluster.markers <- FindMarkers(mergedPCA, ident.1 = cluster,
        thresh.use = 0.01, only.pos = T)  # here we extract markers for cluster N but only the up-regulated ones
    # Most DE on cluster N
    cluster.top <- rownames(cluster.markers)[1:ngenes]
    # plot choice (because rmarkdown doesnt let us plot
    # everything with 1 single function run)
    if (plot == "violin") {
        VlnPlot(mergedPCA, features = c(cluster.top), same.y.lims = TRUE,
            ncol = 5) + theme(legend.position = "right")
    } else if (plot == "umap") {
        FeaturePlot(mergedPCA, features = cluster.top, ncol = 5)
    } else if (plot == "spatial") {
        ST.FeaturePlot(mergedPCA, features = cluster.top, palette = "Spectral",
            pt.size = 2.5)
    }
}

1.12.1 Cluster 0

1.12.1.1 Top 5 genes

1.12.1.1.1 Violin plot
generateplotsDE(0, 10, "violin")

1.12.1.1.2 UMAP plot
generateplotsDE(0, 10, "umap")

1.12.1.1.3 Spatial plot
generateplotsDE(0, 5, "spatial")

1.12.2 Cluster 1

1.12.2.1 Top 5 genes

1.12.2.1.1 Violin plot
generateplotsDE(1, 10, "violin")

1.12.2.1.2 UMAP plot
generateplotsDE(1, 10, "umap")

1.12.2.1.3 Spatial plot
generateplotsDE(1, 5, "spatial")

1.12.3 Cluster 2

1.12.3.1 Top 5 genes

1.12.3.1.1 Violin plot
generateplotsDE(2, 10, "violin")
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing missing values (geom_point).

1.12.3.1.2 UMAP plot
generateplotsDE(2, 10, "umap")

1.12.3.1.3 Spatial plot
generateplotsDE(2, 5, "spatial")

1.12.4 Cluster 3

1.12.4.1 Top 5 genes

1.12.4.1.1 Violin plot
generateplotsDE(3, 10, "violin")
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing missing values (geom_point).

1.12.4.1.2 UMAP plot
generateplotsDE(3, 10, "umap")

1.12.4.1.3 Spatial plot
generateplotsDE(3, 5, "spatial")

1.12.5 Cluster 4

1.12.5.1 Top 5 genes

1.12.5.1.1 Violin plot
generateplotsDE(4, 10, "violin")
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing missing values (geom_point).

1.12.5.1.2 UMAP plot
generateplotsDE(4, 10, "umap")

1.12.5.1.3 Spatial plot
generateplotsDE(4, 5, "spatial")

1.13 Cluster annotation

1.13.1 Functional Enrichment Analysis (FEA)

1.13.1.1 Marker subset for each cluster

# Create marker subset for each cluster
markers0 <- subset(markersAll, cluster == "0") %>%
    filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>%
    pull(gene)
markers1 <- subset(markersAll, cluster == "1") %>%
    filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>%
    pull(gene)
markers2 <- subset(markersAll, cluster == "2") %>%
    filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>%
    pull(gene)
markers3 <- subset(markersAll, cluster == "3") %>%
    filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>%
    pull(gene)
markers4 <- subset(markersAll, cluster == "4") %>%
    filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>%
    pull(gene)
# markers5 <- subset(markersAll, cluster == '5') %>%
# filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>% pull(gene)
# markers6 <- subset(markersAll, cluster == '6') %>%
# filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>% pull(gene)
# markers7 <- subset(markersAll, cluster == '7') %>%
# filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>% pull(gene)
# markers8 <- subset(markersAll, cluster == '8') %>%
# filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>% pull(gene)
# markers9 <- subset(markersAll, cluster == '9') %>%
# filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>% pull(gene)
# markers10 <- subset(markersAll, cluster == '10') %>%
# filter(p_val_adj < 0.05, avg_log2FC > 0.5) %>% pull(gene)

1.13.1.2 Run FEA

Here we run FEA using gprofiler2, and the source we use is Gene Ontology: Biological Process (GO:BP)

go0 <- gost(query = markers0, organism = "hsapiens", significant = TRUE,
    sources = c("GO:MF", "GO:BP", "KEGG", "REAC", "HP", "HPA",
        "WP"))
go1 <- gost(query = markers1, organism = "hsapiens", significant = TRUE,
    sources = c("GO:MF", "GO:BP", "KEGG", "REAC", "HP", "HPA",
        "WP"))
go2 <- gost(query = markers2, organism = "hsapiens", significant = TRUE,
    sources = c("GO:MF", "GO:BP", "KEGG", "REAC", "HP", "HPA",
        "WP"))
go3 <- gost(query = markers3, organism = "hsapiens", significant = TRUE,
    sources = c("GO:MF", "GO:BP", "KEGG", "REAC", "HP", "HPA",
        "WP"))
go4 <- gost(query = markers4, organism = "hsapiens", significant = TRUE,
    sources = c("GO:MF", "GO:BP", "KEGG", "REAC", "HP", "HPA",
        "WP"))
# go5 <- gost(query = markers5, organism = 'hsapiens',
# significant = TRUE, sources = c('GO', 'KEGG', 'REAC',
# 'TF', 'MIRNA', 'CORUM', 'HP', 'HPA', 'WP')) go6 <-
# gost(query = markers6, organism = 'hsapiens', significant
# = TRUE, sources = c('GO', 'KEGG', 'REAC', 'TF', 'MIRNA',
# 'CORUM', 'HP', 'HPA', 'WP')) go7 <- gost(query =
# markers7, organism = 'hsapiens', significant = TRUE,
# sources = 'GO:BP') go8 <- gost(query = markers8, organism
# = 'hsapiens', significant = TRUE, sources = 'GO:BP') go9
# <- gost(query = markers9, organism = 'hsapiens',
# significant = TRUE, sources = 'GO:BP') go10 <- gost(query
# = markers10, organism = 'hsapiens', significant = TRUE,
# sources = 'GO:BP')
1.13.1.2.1 View results
go0$result
go1$result
go2$result
go3$result
go4$result
# go5$result go6$result go7$result go8$result go9$result
# go10$result

1.13.2 Marker gene expression in clusters

1.13.2.1 PAM50

VlnPlot(mergedPCA, features = pam50, same.y.lims = TRUE, ncol = 10)
## Warning in FetchData(object = object, vars = features, slot = slot): The
## following requested variables were not found: ORC6L

1.13.2.2 Yoosuf et al. 

VlnPlot(mergedPCA, features = c("NORAD", "COL1A2", "SCD"), same.y.lims = TRUE,
    ncol = 3)
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing missing values (geom_point).

1.14 Healthy cluster heterogeneity

1.14.1 Subclustering of healthy regions

Subclustering healthy areas that span both samples to see how differently they cluster.

# dimensional reduction and batch correction
healthy <- SubsetSTData(mergedPCA, idents = c(0))
healthy <- RunPCA(healthy, assay = "SCT")
## PC_ 1 
## Positive:  DCN, CFD, C3, IGFBP6, S100A6, VIM, TNXB, GSN, EGFL7, IGFBP7 
##     CLDN5, CXCL12, SERPING1, FBLN1, TXNIP, GPX3, SPARCL1, C1S, PLAC9, C1R 
##     DEPP1, IGHA1, A2M, IFITM2, IFITM3, MYL9, ADIRF, AEBP1, CST3, IGKC 
## Negative:  CD24, PKM, AZGP1, CHCHD2, CBX3, COL1A1, MT-ATP6, PPIA, CYCS, HSP90AA1 
##     PEG10, MT-ND4, MT-CO3, COL1A2, MGP, MT-ND2, PSAP, MT-ND3, RPS24, CTSD 
##     VOPP1, POSTN, MTRNR2L12, MUC1, FN1, NEAT1, SCD, RPS29, SCN5A, PRRC2C 
## PC_ 2 
## Positive:  PIP, RPL17, DCN, RPS27A, RPL34, MGP, KRT15, RPS8, IGHA1, RPL11 
##     AZGP1, RPL5, KRT17, RPS4X, ITM2B, MUCL1, KRT14, RPS3A, TFF3, RPL29 
##     S100A6, FTH1, CST3, RPS17, RPL30, RPSA, SERPINA3, RPL28, TACSTD2, STC2 
## Negative:  CCDC80, NEAT1, COL1A2, BGN, MT-ND3, FSTL1, SAMHD1, PDK4, COL1A1, MT-ND4 
##     CALD1, COL6A3, NAP1L1, EGR1, KLF6, PTMS, NFIX, MAP1B, CTSZ, EIF5B 
##     AL627171.2, PRRC2C, ANKRD12, FOS, RBM25, HSP90AA1, FKBP1A, TMSB4X, MTRNR2L12, THY1 
## PC_ 3 
## Positive:  DCN, IGKC, C3, CCDC80, IGFBP6, FBLN1, SFRP4, IGHA1, TNXB, SFRP2 
##     C1S, C1R, MMP2, PLTP, CXCL12, SELENOP, AEBP1, LUM, JCHAIN, IGFBP4 
##     RARRES2, GPNMB, CFD, CTSK, SERPING1, CCN5, GSN, SFRP1, OGN, AKAP12 
## Negative:  AQP1, IGFBP7, CLDN5, PECAM1, SPARCL1, ACKR1, HLA-E, A2M, ACTA2, EGFL7 
##     MYL9, GNG11, IFITM2, EPAS1, IGFBP3, B2M, HSPG2, FLNA, VWF, CD93 
##     CD24, ADIRF, SNCG, ARAP3, FABP4, MMRN2, APOLD1, TAGLN, ADGRL4, IFITM1 
## PC_ 4 
## Positive:  IGKC, IGHA1, JCHAIN, ACKR1, VWF, IGLC2, IGHG1, HLA-DRA, AQP1, C1S 
##     IGLC1, CD74, GNG11, IGHG3, FABP4, IGLC3, PECAM1, SPRY1, SERPING1, CEBPD 
##     IGHG4, B2M, MBNL1, TGFBR2, MED13L, HLA-B, IL1R1, ZNF385D, CALD1, EGFL7 
## Negative:  TNXB, MYL9, COL6A2, COL1A2, TAGLN, IGFBP6, COL1A1, ELN, A2M, MYH11 
##     TIMP2, FN1, C3, CPE, SERPINF1, CLDN5, AEBP1, MRC2, OGN, SFRP1 
##     SOD3, DPYSL2, DCN, MFAP4, CLEC3B, NEXN, DKK3, COL3A1, S100A4, CXCL12 
## PC_ 5 
## Positive:  RNASE1, CFD, AQP1, ACKR1, CCL14, PLVAP, C1QA, TXNIP, PDK4, CD74 
##     SELENOP, C1QB, IFITM1, CTSC, CST3, TNXB, CLEC3B, S100A10, TGFBR2, TNFAIP2 
##     KLF4, C1QC, VWF, ROBO4, PLTP, FTL, CCN5, IFITM2, DST, REX1BD 
## Negative:  IGKC, JCHAIN, IGHA1, CLDN5, MYL9, ACTA2, FLNA, MYH11, TAGLN, IGFBP3 
##     BGN, DEPP1, MUSTN1, GPX3, MYLK, TIMP1, IGFBP6, COL18A1, LRP1, COL14A1 
##     CALD1, LIMS2, AEBP1, ADH1B, CD151, COL4A1, MYO1C, GAS6, SEMA3G, ID1
ElbowPlot(healthy) + ggtitle("Elbow Plot")

healthy <- RunHarmony(healthy, group.by.vars = "sample_id", assay.use = "SCT",
    reduction = "pca", plot_convergence = TRUE)
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony converged after 3 iterations
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.SCT.harmony; see ?make.names for more details
## on syntax validity

# clustering
healthy <- FindNeighbors(healthy, reduction = "harmony", dims = 1:6)  ## BASED ON ELBOW PLOT
## Computing nearest neighbor graph
## Computing SNN
healthy <- FindClusters(healthy, verbose = FALSE, resolution = 0.2)  ## CLUSTER FORMATION RESOLUTION
healthy <- RunUMAP(healthy, reduction = "harmony", dims = 1:6)  ## BASED ON ELBOW PLOT
## 17:33:37 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:33:37 Read 577 rows and found 6 numeric columns
## 17:33:37 Using Annoy for neighbor search, n_neighbors = 30
## 17:33:37 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:33:37 Writing NN index file to temp file /tmp/Rtmpi7ML7W/file716e653e3678
## 17:33:37 Searching Annoy index using 1 thread, search_k = 3000
## 17:33:37 Annoy recall = 100%
## 17:33:38 Commencing smooth kNN distance calibration using 1 thread
## 17:33:38 Initializing from normalized Laplacian + noise
## 17:33:39 Commencing optimization for 500 epochs, with 21156 positive edges
## 17:33:40 Optimization finished
DimPlot(healthy, reduction = "umap") + ggtitle("Subclustering of healthy clusters")

1.14.1.1 Mapping to tissue

ST.FeaturePlot(healthy, features = "seurat_clusters", pt.size = 2.6,
    ncol = 2) + ggtitle("Mapping of healthy subclusters")

1.14.1.2 Cluster distribution across samples

meta.data <- healthy[[]]
counts <- group_by(meta.data, sample_id, seurat_clusters) %>%
    summarise(count = n())
## `summarise()` has grouped output by 'sample_id'. You can override using the `.groups` argument.
ggplot(counts, aes(sample_id, count, fill = seurat_clusters)) +
    geom_col(position = "fill") + scale_y_continuous(labels = scales::percent) +
    ggtitle("Cluster distribution across samples")

1.14.2 Marker gene expression in healthy areas between samples

1.14.2.1 Markers and heatmap

healthy <- SetIdent(healthy, value = "sample_id")
markersAll_healthy <- FindAllMarkers(healthy, assay = "SCT",
    return.thresh = 0.01)
## Calculating cluster S33-C-2
## Calculating cluster S33-T-2
markersAll_healthy %>%
    group_by(cluster) %>%
    top_n(n = 4, wt = avg_log2FC)
top20_healthy <- markersAll_healthy %>%
    group_by(cluster) %>%
    top_n(n = 20, wt = avg_log2FC)
DoHeatmap(healthy, features = top20_healthy$gene, group.bar.height = 0.005,
    slot = "scale.data", group.by = "sample_id") + plot_annotation("Heatmap of differentially expressed genes across healthy regions")

1.14.2.2 Plotting selected cancer marker genes

Cancer marker genes that are overexpressed in the healthy tissue area compared to the control sample.

top.healthy <- top20_healthy$gene[21:30]
VlnPlot(healthy, features = top.healthy, group.by = "sample_id",
    assay = "SCT", ncol = 5, same.y.lims = TRUE) + plot_annotation("Differentially expressed cancer genes in healthy areas aross samples")
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing missing values (geom_point).

# VlnPlot(healthy, features = c('PKM', 'CD24', 'FN1',
# 'CBX3'), group.by = 'sample_id') +
# plot_annotation('Differentially expressed cancer genes in
# healthy areas aross samples')

1.15 Clustering without control sample

seTumor <- RunPCA(seTumor, verbose = FALSE, assay = "SCT")
ElbowPlot(seTumor) + ggtitle("Elbow plot")

seTumor <- FindNeighbors(seTumor, reduction = "pca", dims = 1:7)  ## BASED ON ELBOW PLOT
## Computing nearest neighbor graph
## Computing SNN
seTumor <- FindClusters(seTumor, verbose = FALSE, resolution = 0.5)  ## CLUSTER FORMATION RESOLUTION
seTumor <- RunUMAP(seTumor, reduction = "pca", dims = 1:7)  ## BASED ON ELBOW PLOT
## 17:33:46 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:33:46 Read 536 rows and found 7 numeric columns
## 17:33:46 Using Annoy for neighbor search, n_neighbors = 30
## 17:33:46 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:33:46 Writing NN index file to temp file /tmp/Rtmpi7ML7W/file716e22826d92
## 17:33:46 Searching Annoy index using 1 thread, search_k = 3000
## 17:33:46 Annoy recall = 100%
## 17:33:46 Commencing smooth kNN distance calibration using 1 thread
## 17:33:47 Initializing from normalized Laplacian + noise
## 17:33:47 Commencing optimization for 500 epochs, with 20506 positive edges
## 17:33:48 Optimization finished
DimPlot(seTumor, reduction = "umap") + ggtitle("UMAP plot (Control-independent)")

ST.FeaturePlot(seTumor, features = "seurat_clusters", pt.size = 2.5,
    ncol = 1)

1.15.1 Cluster-specific marker genes

markersAll_tumor <- FindAllMarkers(seTumor, assay = "SCT", return.thresh = 0.01)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
markersAll_tumor %>%
    group_by(cluster) %>%
    top_n(n = 4, wt = avg_log2FC)

1.15.1.1 Heatmap

top20 <- markersAll_tumor %>%
    group_by(cluster) %>%
    top_n(n = 20, wt = avg_log2FC)
DoHeatmap(seTumor, features = top20$gene, group.bar.height = 0.005,
    slot = "scale.data") + plot_annotation("Heatmap of differentially expressed genes between clusters")

1.16 DE comparison between tumour healthy versus tumour control

In this segment, we selected and merged cancer regions and performed DEA. For the modalities of cancer + control and cancer only, the clusters will be merged into cancer regions and cancer-devoid regions based on the previous marker genes. The DE of both modalities will be compared.

1.16.1 DE in Cancer-only sample

# Merging cancer clusters (by renaming)
seTumor_merged <- seTumor
new.cluster.ids <- c("healthy1", "tumor1", "tumor2", "tumor3",
    "tumor4", "adipose")
names(new.cluster.ids) <- levels(seTumor_merged)
seTumor_merged <- RenameIdents(seTumor, new.cluster.ids)
tumor.vs.tumor.markers <- FindMarkers(seTumor_merged, ident.1 = c("tumor1",
    "tumor2", "tumor3", "tumor4"), ident.2 = "healthy1", assay = "SCT")

1.16.1.1 Top marker genes

top20_ci <- tumor.vs.tumor.markers %>%
    top_n(n = 20, wt = avg_log2FC)
DoHeatmap(seTumor_merged, features = rownames(top20_ci), group.bar.height = 0.005,
    slot = "scale.data") + plot_annotation("Grouped Tumor vs Healthy area (tumor-only)")

1.16.2 DE including Control sample

# Merging cancer and healthy clusters (by renaming)
merged_merged <- mergedPCA
new.cluster.ids <- c("healthy1", "tumor1", "healthy2", "tumor2",
    "adipose")
names(new.cluster.ids) <- levels(merged_merged)
merged_merged <- RenameIdents(merged_merged, new.cluster.ids)
levels(merged_merged) <- c("healthy1", "healthy2", "tumor1",
    "tumor2", "adipose")
# tumor.vs.control.markers <- FindMarkers(merged_merged,
# ident.1 = 'tumor', ident.2 = 'healthy')
tumor.vs.control.markers <- FindMarkers(merged_merged, ident.1 = c("tumor1",
    "tumor2"), ident.2 = c("healthy1", "healthy2"))

1.16.2.1 Top marker genes

top20_cd <- tumor.vs.control.markers %>%
    top_n(n = 20, wt = avg_log2FC)
DoHeatmap(merged_merged, features = rownames(top20_cd), group.bar.height = 0.005,
    slot = "scale.data") + plot_annotation("Grouped Tumor vs Healthy area (control-integrated dataset)")

1.16.2.2 Venn diagram ALL

cd <- c(rownames(tumor.vs.control.markers))
ci <- c(rownames(tumor.vs.tumor.markers))
x <- list(cd, ci)


# venn.diagram(list(cd,ci), filename= 'vennTest',
# category.names = c('Control-dependent' ,
# 'Control-Independent'), output=TRUE, disable.logging =
# TRUE)

v <- venn.diagram(x, filename = NULL, category.names = c("With Control",
    "Tumour only"), disable.logging = TRUE, fill = c("blue",
    "red"))
## INFO [2022-01-12 17:33:54] [[1]]
## INFO [2022-01-12 17:33:54] x
## INFO [2022-01-12 17:33:54] 
## INFO [2022-01-12 17:33:54] $filename
## INFO [2022-01-12 17:33:54] NULL
## INFO [2022-01-12 17:33:54] 
## INFO [2022-01-12 17:33:54] $category.names
## INFO [2022-01-12 17:33:54] c("With Control", "Tumour only")
## INFO [2022-01-12 17:33:54] 
## INFO [2022-01-12 17:33:54] $disable.logging
## INFO [2022-01-12 17:33:54] [1] TRUE
## INFO [2022-01-12 17:33:54] 
## INFO [2022-01-12 17:33:54] $fill
## INFO [2022-01-12 17:33:54] c("blue", "red")
## INFO [2022-01-12 17:33:54]
# have a look at the default plot
grid.newpage()
grid.draw(v)

# have a look at the names in the plot object v
lapply(v, names)
## [[1]]
## [1] "x"          "y"          "id"         "id.lengths" "name"      
## [6] "gp"         "vp"         "params"    
## 
## [[2]]
## [1] "x"          "y"          "id"         "id.lengths" "name"      
## [6] "gp"         "vp"         "params"    
## 
## [[3]]
## [1] "x"          "y"          "id"         "id.lengths" "name"      
## [6] "gp"         "vp"         "params"    
## 
## [[4]]
## [1] "x"          "y"          "id"         "id.lengths" "name"      
## [6] "gp"         "vp"         "params"    
## 
## [[5]]
##  [1] "label"         "x"             "y"             "just"         
##  [5] "hjust"         "vjust"         "rot"           "check.overlap"
##  [9] "name"          "gp"            "vp"           
## 
## [[6]]
##  [1] "label"         "x"             "y"             "just"         
##  [5] "hjust"         "vjust"         "rot"           "check.overlap"
##  [9] "name"          "gp"            "vp"           
## 
## [[7]]
##  [1] "label"         "x"             "y"             "just"         
##  [5] "hjust"         "vjust"         "rot"           "check.overlap"
##  [9] "name"          "gp"            "vp"           
## 
## [[8]]
##  [1] "label"         "x"             "y"             "just"         
##  [5] "hjust"         "vjust"         "rot"           "check.overlap"
##  [9] "name"          "gp"            "vp"           
## 
## [[9]]
##  [1] "label"         "x"             "y"             "just"         
##  [5] "hjust"         "vjust"         "rot"           "check.overlap"
##  [9] "name"          "gp"            "vp"
# We are interested in the labels
lapply(v, function(i) i$label)
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## [1] "129"
## 
## [[6]]
## [1] "102"
## 
## [[7]]
## [1] "156"
## 
## [[8]]
## [1] "With Control"
## 
## [[9]]
## [1] "Tumour only"
# Over-write labels (5 to 7 chosen by manual check of
# labels) in foo only
v[[5]]$label <- paste(setdiff(cd, ci), collapse = "\n")
# in baa only
v[[6]]$label <- paste(setdiff(ci, cd), collapse = "\n")
# intesection
v[[7]]$label <- paste(intersect(cd, ci), collapse = "\n")

# plot grid.newpage() grid.draw(v)

1.16.2.3 Venn diagram top 20

cd2 <- c(rownames(top20_cd))
ci2 <- c(rownames(top20_ci))
x <- list(cd, ci)


# venn.diagram(list(cd,ci), filename= 'vennTest',
# category.names = c('Control-dependent' ,
# 'Control-Independent'), output=TRUE, disable.logging =
# TRUE)

v <- venn.diagram(x, filename = NULL, category.names = c("With Control",
    "Tumour only"), disable.logging = TRUE, fill = c("blue",
    "red"))
## INFO [2022-01-12 17:33:54] [[1]]
## INFO [2022-01-12 17:33:54] x
## INFO [2022-01-12 17:33:54] 
## INFO [2022-01-12 17:33:54] $filename
## INFO [2022-01-12 17:33:54] NULL
## INFO [2022-01-12 17:33:54] 
## INFO [2022-01-12 17:33:54] $category.names
## INFO [2022-01-12 17:33:54] c("With Control", "Tumour only")
## INFO [2022-01-12 17:33:54] 
## INFO [2022-01-12 17:33:54] $disable.logging
## INFO [2022-01-12 17:33:54] [1] TRUE
## INFO [2022-01-12 17:33:54] 
## INFO [2022-01-12 17:33:54] $fill
## INFO [2022-01-12 17:33:54] c("blue", "red")
## INFO [2022-01-12 17:33:54]
# have a look at the default plot
grid.newpage()
grid.draw(v)

# have a look at the names in the plot object v
lapply(v, names)
## [[1]]
## [1] "x"          "y"          "id"         "id.lengths" "name"      
## [6] "gp"         "vp"         "params"    
## 
## [[2]]
## [1] "x"          "y"          "id"         "id.lengths" "name"      
## [6] "gp"         "vp"         "params"    
## 
## [[3]]
## [1] "x"          "y"          "id"         "id.lengths" "name"      
## [6] "gp"         "vp"         "params"    
## 
## [[4]]
## [1] "x"          "y"          "id"         "id.lengths" "name"      
## [6] "gp"         "vp"         "params"    
## 
## [[5]]
##  [1] "label"         "x"             "y"             "just"         
##  [5] "hjust"         "vjust"         "rot"           "check.overlap"
##  [9] "name"          "gp"            "vp"           
## 
## [[6]]
##  [1] "label"         "x"             "y"             "just"         
##  [5] "hjust"         "vjust"         "rot"           "check.overlap"
##  [9] "name"          "gp"            "vp"           
## 
## [[7]]
##  [1] "label"         "x"             "y"             "just"         
##  [5] "hjust"         "vjust"         "rot"           "check.overlap"
##  [9] "name"          "gp"            "vp"           
## 
## [[8]]
##  [1] "label"         "x"             "y"             "just"         
##  [5] "hjust"         "vjust"         "rot"           "check.overlap"
##  [9] "name"          "gp"            "vp"           
## 
## [[9]]
##  [1] "label"         "x"             "y"             "just"         
##  [5] "hjust"         "vjust"         "rot"           "check.overlap"
##  [9] "name"          "gp"            "vp"
# We are interested in the labels
lapply(v, function(i) i$label)
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## [1] "129"
## 
## [[6]]
## [1] "102"
## 
## [[7]]
## [1] "156"
## 
## [[8]]
## [1] "With Control"
## 
## [[9]]
## [1] "Tumour only"
# Over-write labels (5 to 7 chosen by manual check of
# labels) in foo only
v[[5]]$label <- paste(setdiff(cd2, ci2), collapse = "\n")
# in baa only
v[[6]]$label <- paste(setdiff(ci2, cd2), collapse = "\n")
# intesection
v[[7]]$label <- paste(intersect(cd2, ci2), collapse = "\n")

# plot
grid.newpage()
grid.draw(v)